Data Loading and Preparation
library(tidyverse)
library(lubridate)
library(ggplot2)
library(plotly)
library(DT)
library(kableExtra)
library(viridis)
library(gridExtra)
df <- read.csv("Collection Observations - Birds Beak-2.csv", stringsAsFactors = FALSE)
# Display the structure of the data
str(df)
## 'data.frame': 86 obs. of 15 variables:
## $ Date : chr "6/19/2025" "6/19/2025" "6/19/2025" "6/19/2025" ...
## $ Week : chr "1b" "1b" "1b" "1b" ...
## $ Site.. : int 1 2 3 4 3 2 4 1 2 1 ...
## $ Camera.. : int 1 2 1 2 1 2 1 2 1 2 ...
## $ Start.Time : chr "11:00:00 AM" "11:00:00 AM" "11:30:00 AM" "11:30:00 AM" ...
## $ End.Time : chr "11:30:00 AM" "11:30:00 AM" "12:00:00 PM" "12:00:00 PM" ...
## $ Weather : int 66 66 68 68 68 68 68 68 69 69 ...
## $ Conditions : chr "overcast" "overcast" "overcast" "overcast" ...
## $ Wind : num 2.5 2.5 5.5 5.5 4.8 4.8 4.5 4.5 6.1 6.1 ...
## $ Wind.direction : chr "S " "S " "SW" "SW" ...
## $ Activity.on.site: chr "none" "none" "none" "none" ...
## $ Activity.around : chr "none" "none" "none" "small bird flying over plant, landing on stakes" ...
## $ Video.Duration : chr "30 min" "30 min" "30 min" "9 min" ...
## $ Shift.type : chr "Morning" "Morning" "Morning" "Morning" ...
## $ Notes : chr "" "" "" "?" ...
head(df)
## Date Week Site.. Camera.. Start.Time End.Time Weather Conditions
## 1 6/19/2025 1b 1 1 11:00:00 AM 11:30:00 AM 66 overcast
## 2 6/19/2025 1b 2 2 11:00:00 AM 11:30:00 AM 66 overcast
## 3 6/19/2025 1b 3 1 11:30:00 AM 12:00:00 PM 68 overcast
## 4 6/19/2025 1b 4 2 11:30:00 AM 12:00:00 PM 68 overcast
## 5 6/19/2025 1b 3 1 12:00:00 PM 12:30:00 PM 68 overcast
## 6 6/19/2025 1b 2 2 12:00:00 PM 12:30:00 PM 68 overcast
## Wind Wind.direction Activity.on.site
## 1 2.5 S none
## 2 2.5 S none
## 3 5.5 SW none
## 4 5.5 SW none
## 5 4.8 SW none
## 6 4.8 SW none
## Activity.around Video.Duration Shift.type
## 1 none 30 min Morning
## 2 none 30 min Morning
## 3 none 30 min Morning
## 4 small bird flying over plant, landing on stakes 9 min Morning
## 5 small bird flying over plant, landing on stakes 30 min Mid
## 6 none 13.07 min Mid
## Notes
## 1
## 2
## 3
## 4 ?
## 5
## 6 ?
Data Cleaning and Processing
# Clean and process the data
df_clean <- df %>%
# Remove empty rows
filter(!is.na(Date) & Date != "") %>%
# Convert date and time columns
mutate(
Date = mdy(Date),
Start.Time = hms(Start.Time),
End.Time = hms(End.Time),
# Extract numeric values from wind speed
Wind = as.numeric(Wind),
# Clean temperature (assuming it's in Fahrenheit)
Temperature = as.numeric(Weather),
# Create a datetime column for start time
DateTime = ymd_hms(paste(Date, Start.Time)),
# Extract bee counts from Activity columns
Bombus_on_site = case_when(
str_detect(tolower(Activity.on.site), "bombus") ~ str_extract_all(Activity.on.site, "\\d+") %>%
map_chr(~ifelse(length(.x) > 0, .x[1], "1")),
str_detect(tolower(Activity.on.site), "none|na") ~ "0",
TRUE ~ "0"
),
Bombus_around = case_when(
str_detect(tolower(Activity.around), "bombus") ~ str_extract_all(Activity.around, "\\d+") %>%
map_chr(~ifelse(length(.x) > 0, .x[1], "1")),
str_detect(tolower(Activity.around), "none|na") ~ "0",
TRUE ~ "0"
),
# Convert to numeric
Bombus_on_site = as.numeric(Bombus_on_site),
Bombus_around = as.numeric(Bombus_around),
# Total bombus count
Total_Bombus = Bombus_on_site + Bombus_around,
# Create week number
Week_num = as.numeric(str_extract(Week, "\\d+")),
# Convert video duration to minutes
Video_Duration_min = case_when(
str_detect(Video.Duration, "min") ~ as.numeric(str_extract(Video.Duration, "\\d+\\.?\\d*")),
TRUE ~ 30 # Default to 30 minutes
)
) %>%
# Filter out rows with missing essential data
filter(!is.na(Date), !is.na(Site..))
# Display cleaned data summary
summary(df_clean)
## Date Week Site.. Camera..
## Min. :2025-06-19 Length:82 Min. :1.00 Min. :1.0
## 1st Qu.:2025-06-23 Class :character 1st Qu.:1.25 1st Qu.:1.0
## Median :2025-06-26 Mode :character Median :2.50 Median :1.5
## Mean :2025-06-27 Mean :2.50 Mean :1.5
## 3rd Qu.:2025-07-03 3rd Qu.:3.75 3rd Qu.:2.0
## Max. :2025-07-09 Max. :4.00 Max. :2.0
##
## Start.Time End.Time Weather
## Min. :1H 0M 0S Min. :1H 0M 0S Min. :61.00
## 1st Qu.:3H 0M 0S 1st Qu.:3H 0M 0S 1st Qu.:65.00
## Median :5H 55M 0S Median :4H 30M 0S Median :68.00
## Mean :6H 38M 45S Mean :6H 24M 24.4444444444453S Mean :67.32
## 3rd Qu.:10H 7M 30S 3rd Qu.:10H 30M 0S 3rd Qu.:69.00
## Max. :12H 30M 0S Max. :12H 30M 0S Max. :73.00
## NA's :2 NA's :1
## Conditions Wind Wind.direction Activity.on.site
## Length:82 Min. :2.100 Length:82 Length:82
## Class :character 1st Qu.:2.900 Class :character Class :character
## Mode :character Median :4.500 Mode :character Mode :character
## Mean :4.698
## 3rd Qu.:6.600
## Max. :8.400
##
## Activity.around Video.Duration Shift.type Notes
## Length:82 Length:82 Length:82 Length:82
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Temperature DateTime Bombus_on_site
## Min. :61.00 Min. :2025-06-19 03:00:00 Min. :0.00000
## 1st Qu.:65.00 1st Qu.:2025-06-23 05:00:00 1st Qu.:0.00000
## Median :68.00 Median :2025-06-25 08:15:00 Median :0.00000
## Mean :67.32 Mean :2025-06-27 20:26:45 Mean :0.09756
## 3rd Qu.:69.00 3rd Qu.:2025-07-03 09:37:30 3rd Qu.:0.00000
## Max. :73.00 Max. :2025-07-09 11:30:00 Max. :2.00000
## NA's :2
## Bombus_around Total_Bombus Week_num Video_Duration_min
## Min. :0.0000 Min. :0.0000 Min. :1.000 Min. : 2.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:30.00
## Median :0.0000 Median :0.0000 Median :2.000 Median :30.00
## Mean :0.1341 Mean :0.2317 Mean :2.439 Mean :27.56
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:3.000 3rd Qu.:30.00
## Max. :2.0000 Max. :3.0000 Max. :4.000 Max. :30.00
##
Exploratory Data Analysis
Data Overview
# Basic statistics table
overview_stats <- df_clean %>%
summarise(
Total_Observations = n(),
Unique_Sites = n_distinct(Site..),
Date_Range = paste(min(Date, na.rm = TRUE), "to", max(Date, na.rm = TRUE)),
Total_Bombus_Observed = sum(Total_Bombus, na.rm = TRUE),
Avg_Bombus_per_Session = round(mean(Total_Bombus, na.rm = TRUE), 2),
Sessions_with_Bombus = sum(Total_Bombus > 0, na.rm = TRUE),
Percent_Sessions_with_Bombus = round((sum(Total_Bombus > 0, na.rm = TRUE) / n()) * 100, 1)
)
kable(overview_stats, caption = "Dataset Overview") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Dataset Overview
|
Total_Observations
|
Unique_Sites
|
Date_Range
|
Total_Bombus_Observed
|
Avg_Bombus_per_Session
|
Sessions_with_Bombus
|
Percent_Sessions_with_Bombus
|
|
82
|
4
|
2025-06-19 to 2025-07-09
|
19
|
0.23
|
11
|
13.4
|
Bombus Activity Over Time
# Daily bombus activity
daily_activity <- df_clean %>%
group_by(Date) %>%
summarise(
Total_Bombus = sum(Total_Bombus, na.rm = TRUE),
Sessions = n(),
Avg_Bombus = round(Total_Bombus / Sessions, 2),
.groups = 'drop'
)
# Time series plot
p1 <- ggplot(daily_activity, aes(x = Date, y = Total_Bombus)) +
geom_line(color = "steelblue", size = 1) +
geom_point(color = "darkblue", size = 2) +
labs(title = "Daily Bombus Activity",
x = "Date",
y = "Total Bombus Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Average bombus per session
p2 <- ggplot(daily_activity, aes(x = Date, y = Avg_Bombus)) +
geom_line(color = "orange", size = 1) +
geom_point(color = "darkorange", size = 2) +
labs(title = "Average Bombus per Session",
x = "Date",
y = "Average Bombus Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
grid.arrange(p1, p2, ncol = 1)

Bombus Activity by Site
# Bombus activity by site
site_activity <- df_clean %>%
group_by(Site..) %>%
summarise(
Total_Bombus = sum(Total_Bombus, na.rm = TRUE),
Sessions = n(),
Avg_Bombus = round(Total_Bombus / Sessions, 2),
Sessions_with_Bombus = sum(Total_Bombus > 0, na.rm = TRUE),
Percent_with_Bombus = round((Sessions_with_Bombus / Sessions) * 100, 1),
.groups = 'drop'
)
# Bar plot of total bombus by site
p3 <- ggplot(site_activity, aes(x = factor(Site..), y = Total_Bombus, fill = factor(Site..))) +
geom_bar(stat = "identity", alpha = 0.7) +
scale_fill_viridis_d() +
labs(title = "Total Bombus Count by Site",
x = "Site Number",
y = "Total Bombus Count") +
theme_minimal() +
theme(legend.position = "none")
# Percentage of sessions with bombus by site
p4 <- ggplot(site_activity, aes(x = factor(Site..), y = Percent_with_Bombus, fill = factor(Site..))) +
geom_bar(stat = "identity", alpha = 0.7) +
scale_fill_viridis_d() +
labs(title = "Percentage of Sessions with Bombus by Site",
x = "Site Number",
y = "Percentage of Sessions") +
theme_minimal() +
theme(legend.position = "none")
grid.arrange(p3, p4, ncol = 1)

# Display site statistics table
kable(site_activity, caption = "Bombus Activity by Site") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Bombus Activity by Site
|
Site..
|
Total_Bombus
|
Sessions
|
Avg_Bombus
|
Sessions_with_Bombus
|
Percent_with_Bombus
|
|
1
|
4
|
21
|
0.19
|
1
|
4.8
|
|
2
|
6
|
20
|
0.30
|
1
|
5.0
|
|
3
|
4
|
20
|
0.20
|
1
|
5.0
|
|
4
|
5
|
21
|
0.24
|
1
|
4.8
|
Environmental Factors and Bombus Activity
# Temperature vs Bombus activity
temp_analysis <- df_clean %>%
filter(!is.na(Temperature)) %>%
group_by(Temperature) %>%
summarise(
Avg_Bombus = mean(Total_Bombus, na.rm = TRUE),
Sessions = n(),
.groups = 'drop'
)
p5 <- ggplot(df_clean, aes(x = Temperature, y = Total_Bombus)) +
geom_point(alpha = 0.6, color = "darkgreen") +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "Temperature vs Bombus Activity",
x = "Temperature (°F)",
y = "Total Bombus Count") +
theme_minimal()
# Wind speed vs Bombus activity
p6 <- ggplot(df_clean, aes(x = Wind, y = Total_Bombus)) +
geom_point(alpha = 0.6, color = "darkblue") +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "Wind Speed vs Bombus Activity",
x = "Wind Speed",
y = "Total Bombus Count") +
theme_minimal()
grid.arrange(p5, p6, ncol = 1)

Time of Day Analysis
# Convert start time to hours for analysis
df_clean <- df_clean %>%
mutate(
Start_Hour = hour(Start.Time),
Time_Period = case_when(
Start_Hour < 12 ~ "Morning",
Start_Hour < 15 ~ "Midday",
TRUE ~ "Afternoon"
)
)
# Bombus activity by time period
time_activity <- df_clean %>%
group_by(Time_Period) %>%
summarise(
Total_Bombus = sum(Total_Bombus, na.rm = TRUE),
Sessions = n(),
Avg_Bombus = round(Total_Bombus / Sessions, 2),
.groups = 'drop'
)
p7 <- ggplot(time_activity, aes(x = Time_Period, y = Avg_Bombus, fill = Time_Period)) +
geom_bar(stat = "identity", alpha = 0.7) +
scale_fill_viridis_d() +
labs(title = "Average Bombus Activity by Time Period",
x = "Time Period",
y = "Average Bombus Count") +
theme_minimal() +
theme(legend.position = "none")
# Hour-by-hour activity
hourly_activity <- df_clean %>%
group_by(Start_Hour) %>%
summarise(
Total_Bombus = sum(Total_Bombus, na.rm = TRUE),
Sessions = n(),
Avg_Bombus = round(Total_Bombus / Sessions, 2),
.groups = 'drop'
)
p8 <- ggplot(hourly_activity, aes(x = Start_Hour, y = Avg_Bombus)) +
geom_line(color = "purple", size = 1) +
geom_point(color = "blue", size = 2) +
labs(title = "Bombus Activity Throughout the Day",
x = "Hour of Day",
y = "Average Bombus Count") +
theme_minimal() +
scale_x_continuous(breaks = seq(9, 18, 1))
grid.arrange(p7, p8, ncol = 1)

Weather Conditions Analysis
# Clean weather conditions
df_clean <- df_clean %>%
mutate(
Weather_Clean = case_when(
str_detect(tolower(Conditions), "sunny") ~ "Sunny",
str_detect(tolower(Conditions), "overcast") ~ "Overcast",
str_detect(tolower(Conditions), "cloudy") ~ "Cloudy",
TRUE ~ "Other"
)
)
# Bombus activity by weather condition
weather_activity <- df_clean %>%
group_by(Weather_Clean) %>%
summarise(
Total_Bombus = sum(Total_Bombus, na.rm = TRUE),
Sessions = n(),
Avg_Bombus = round(Total_Bombus / Sessions, 2),
Sessions_with_Bombus = sum(Total_Bombus > 0, na.rm = TRUE),
Percent_with_Bombus = round((Sessions_with_Bombus / Sessions) * 100, 1),
.groups = 'drop'
)
p9 <- ggplot(weather_activity, aes(x = Weather_Clean, y = Avg_Bombus, fill = Weather_Clean)) +
geom_bar(stat = "identity", alpha = 0.7) +
scale_fill_viridis_d() +
labs(title = "Average Bombus Activity by Weather Condition",
x = "Weather Condition",
y = "Average Bombus Count") +
theme_minimal() +
theme(legend.position = "none")
print(p9)

# Display weather statistics table
kable(weather_activity, caption = "Bombus Activity by Weather Condition") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Bombus Activity by Weather Condition
|
Weather_Clean
|
Total_Bombus
|
Sessions
|
Avg_Bombus
|
Sessions_with_Bombus
|
Percent_with_Bombus
|
|
Cloudy
|
0
|
10
|
0.00
|
0
|
0.0
|
|
Overcast
|
4
|
26
|
0.15
|
1
|
3.8
|
|
Sunny
|
15
|
46
|
0.33
|
1
|
2.2
|
Weekly Progress Analysis
# Weekly bombus activity
weekly_activity <- df_clean %>%
group_by(Week_num) %>%
summarise(
Total_Bombus = sum(Total_Bombus, na.rm = TRUE),
Sessions = n(),
Avg_Bombus = round(Total_Bombus / Sessions, 2),
Sessions_with_Bombus = sum(Total_Bombus > 0, na.rm = TRUE),
Percent_with_Bombus = round((Sessions_with_Bombus / Sessions) * 100, 1),
.groups = 'drop'
)
p10 <- ggplot(weekly_activity, aes(x = Week_num, y = Total_Bombus)) +
geom_line(color = "darkgreen", size = 1) +
geom_point(color = "darkgreen", size = 3) +
labs(title = "Weekly Bombus Activity Trend",
x = "Week Number",
y = "Total Bombus Count") +
theme_minimal() +
scale_x_continuous(breaks = unique(weekly_activity$Week_num))
print(p10)

# Display weekly statistics table
kable(weekly_activity, caption = "Weekly Bombus Activity Summary") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Weekly Bombus Activity Summary
|
Week_num
|
Total_Bombus
|
Sessions
|
Avg_Bombus
|
Sessions_with_Bombus
|
Percent_with_Bombus
|
|
1
|
7
|
16
|
0.44
|
1
|
6.2
|
|
2
|
0
|
34
|
0.00
|
0
|
0.0
|
|
3
|
8
|
12
|
0.67
|
1
|
8.3
|
|
4
|
4
|
20
|
0.20
|
1
|
5.0
|
Data Quality Assessment
# Check for missing data and technical issues
quality_check <- df_clean %>%
mutate(
Has_Technical_Issue = str_detect(tolower(Notes), "died|overheated|memory|card|froze"),
Full_Duration = Video_Duration_min >= 29 # Sessions that recorded close to full 30 minutes
) %>%
summarise(
Total_Sessions = n(),
Sessions_with_Issues = sum(Has_Technical_Issue, na.rm = TRUE),
Percent_with_Issues = round((Sessions_with_Issues / Total_Sessions) * 100, 1),
Full_Duration_Sessions = sum(Full_Duration, na.rm = TRUE),
Percent_Full_Duration = round((Full_Duration_Sessions / Total_Sessions) * 100, 1),
Missing_Bombus_Data = sum(is.na(Total_Bombus)),
.groups = 'drop'
)
kable(quality_check, caption = "Data Quality Assessment") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Data Quality Assessment
|
Total_Sessions
|
Sessions_with_Issues
|
Percent_with_Issues
|
Full_Duration_Sessions
|
Percent_Full_Duration
|
Missing_Bombus_Data
|
|
82
|
18
|
22
|
68
|
82.9
|
0
|
# Technical issues by date
technical_issues <- df_clean %>%
mutate(Has_Technical_Issue = str_detect(tolower(Notes), "died|overheated|memory|card|froze")) %>%
group_by(Date) %>%
summarise(
Total_Sessions = n(),
Sessions_with_Issues = sum(Has_Technical_Issue, na.rm = TRUE),
Percent_with_Issues = round((Sessions_with_Issues / Total_Sessions) * 100, 1),
.groups = 'drop'
) %>%
filter(Sessions_with_Issues > 0)
if(nrow(technical_issues) > 0) {
kable(technical_issues, caption = "Technical Issues by Date") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
}
Technical Issues by Date
|
Date
|
Total_Sessions
|
Sessions_with_Issues
|
Percent_with_Issues
|
|
2025-06-19
|
12
|
4
|
33.3
|
|
2025-06-20
|
4
|
4
|
100.0
|
|
2025-06-23
|
12
|
4
|
33.3
|
|
2025-06-26
|
10
|
2
|
20.0
|
|
2025-07-03
|
12
|
2
|
16.7
|
|
2025-07-07
|
12
|
2
|
16.7
|
Interactive Data Exploration
# Create interactive plot for bombus activity over time
interactive_plot <- df_clean %>%
plot_ly(x = ~DateTime, y = ~Total_Bombus,
color = ~factor(Site..),
size = ~Video_Duration_min,
text = ~paste("Site:", Site.., "<br>",
"Date:", Date, "<br>",
"Time:", Start.Time, "<br>",
"Bombus Count:", Total_Bombus, "<br>",
"Weather:", Weather_Clean, "<br>",
"Temperature:", Temperature, "°F", "<br>",
"Wind Speed:", Wind),
hovertemplate = "%{text}<extra></extra>") %>%
add_markers() %>%
layout(title = "Interactive Bombus Activity Timeline",
xaxis = list(title = "Date and Time"),
yaxis = list(title = "Total Bombus Count"),
showlegend = TRUE)
interactive_plot
Summary and Recommendations
Key Findings
# Calculate key metrics for summary
summary_metrics <- df_clean %>%
summarise(
Peak_Activity_Date = Date[which.max(Total_Bombus)],
Peak_Activity_Count = max(Total_Bombus, na.rm = TRUE),
Most_Active_Site = Site..[which.max(Total_Bombus)],
Best_Weather = weather_activity$Weather_Clean[which.max(weather_activity$Avg_Bombus)],
Optimal_Temperature = round(mean(df_clean$Temperature[df_clean$Total_Bombus > 0], na.rm = TRUE), 1),
Best_Time_Period = time_activity$Time_Period[which.max(time_activity$Avg_Bombus)],
.groups = 'drop'
)
cat("## Key Findings:\n\n")
## ## Key Findings:
cat("- **Peak Activity Date:** ", as.character(summary_metrics$Peak_Activity_Date),
" (", summary_metrics$Peak_Activity_Count, " bombus observed)\n")
## - **Peak Activity Date:** 2025-07-03 ( 3 bombus observed)
cat("- **Most Active Site:** Site ", summary_metrics$Most_Active_Site, "\n")
## - **Most Active Site:** Site 4
cat("- **Best Weather Conditions:** ", summary_metrics$Best_Weather, "\n")
## - **Best Weather Conditions:** Sunny
cat("- **Optimal Temperature:** ", summary_metrics$Optimal_Temperature, "°F\n")
## - **Optimal Temperature:** 69.9 °F
cat("- **Best Time Period:** ", summary_metrics$Best_Time_Period, "\n")
## - **Best Time Period:** Morning
cat("- **Overall Success Rate:** ", overview_stats$Percent_Sessions_with_Bombus, "% of sessions had bombus activity\n")
## - **Overall Success Rate:** 13.4 % of sessions had bombus activity
Recommendations for Future Data Collection
Based on the analysis, here are recommendations for your future data
collection and AI model training:
- Optimal Collection Conditions:
- Focus on sunny weather conditions when possible
- Target the most productive time periods identified in the
analysis
- Consider environmental factors like temperature and wind speed
- Site Strategy:
- Prioritize sites with higher bombus activity rates
- Consider expanding monitoring at successful sites
- Technical Improvements:
- Address camera reliability issues (overheating, memory cards)
- Ensure consistent 30-minute recording sessions
- Consider backup equipment for high-activity periods
- Data for AI Training:
- Current dataset provides good baseline for training data
- Include environmental metadata with video annotations
- Consider balancing dataset with negative examples (no bombus
activity)
- Future Analysis:
- Track seasonal patterns as data accumulates
- Correlate with plant phenology data if available
- Consider pollen load analysis for bombus specimens
Analysis completed on 2025-07-09